Load packages
library(readr)
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(grid)
library(gridExtra)
library(DT)
Prepare data
# Load data.
# The ID columns are read as character up front. Left to readr's type
# guessing, all-digit IDs would be parsed as doubles, and 19-digit values such
# as "1615794190662701056" (matched as exact strings later in this file) are
# larger than 2^53, the limit up to which a double represents integers
# exactly. Distinct IDs could then collapse into the same factor level.
df <- read_csv(
  "df.csv",
  col_types = cols(
    user_id = col_character(),
    tweet_id = col_character(),
    account_id = col_character(),
    .default = col_guess()
  )
)

# Define data types: the ID columns become factors, and `account_id` is
# carried forward under the name `friend_id`.
df <- df %>%
  mutate(
    user_id = as.factor(user_id),
    tweet_id = as.factor(tweet_id),
    friend_id = as.factor(account_id)
  ) %>%
  dplyr::select(-account_id)
# What is the maximum (or last) number of user_friends_count for each user?
# max_data: user_id - max_friends_count, one row per user.
# Note: max() is called without na.rm, so a user with any missing
# user_friends_count gets NA here.
max_data <- df %>%
  group_by(user_id) %>%
  summarise(max_friends_count = max(user_friends_count))

# Merge 'max_data' into df.
# left_join() keeps df's existing row order and tibble class; base merge()
# re-sorts the rows by the key and returns a plain data.frame. Every user in
# df has a row in max_data, so no rows are gained or lost.
df <- df %>%
  left_join(max_data, by = "user_id")
# x-axis: running number of tweets collected during one week, per user.
# Rows are first counted per (user_id, tweet_id); the running total of those
# counts within each user becomes x.
df_for_x <- df %>%
  count(user_id, tweet_id) %>%
  group_by(user_id) %>%
  mutate(x = cumsum(n)) %>%
  dplyr::select(user_id, tweet_id, x)

# Attach x to every row of df via its (user_id, tweet_id) key.
df <- inner_join(df, df_for_x, by = c("user_id", "tweet_id"))
# y-axis: fraction of a user's friends seen so far.
#   numerator   - running count of distinct friend_id values, taken in
#                 ascending order of x within each user
#   denominator - max_friends_count
# The result stays grouped by user_id.
df2 <- df %>%
  arrange(user_id, x) %>%
  group_by(user_id) %>%
  mutate(
    numerator = cumsum(!duplicated(friend_id)),
    y = numerator / max_friends_count
  )
# df2 is the final data for drawing plots
# plot 1: every user overlaid in one panel, one colour per user.
# y is a proportion (numerator / max_friends_count) while the axis title
# announces "(%)", so the tick labels are multiplied by 100 to match the title.
df2 %>%
  ggplot(aes(x = x, y = y, col = user_id)) +
  geom_point(alpha = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(labels = scales::label_number(scale = 100))
In the data frame (df2), there are 60 unique users.
# One panel per user, six panels per row, with identical axes in every panel.
# y is a proportion while the axis title announces "(%)", so the tick labels
# are multiplied by 100 to match the title.
df2 %>%
  ggplot(aes(x = x, y = y, col = user_id)) +
  geom_point(alpha = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(labels = scales::label_number(scale = 100)) +
  facet_wrap(~user_id, ncol = 6, scales = "fixed")
Let’s redraw the plot, this time splitting the sample into smaller chunks. I also allow the scale of the x-axis to vary for each user.
# Draw the per-user facet plot for one chunk of users.
#
# data       - data frame with columns user_id (factor), x and y
# first_code - lowest factor code of user_id to keep (inclusive)
# last_code  - highest factor code of user_id to keep (inclusive)
# title      - plot title
#
# Users are selected by the integer code of their user_id factor level.
# The x-axis is free per panel. y is a proportion while the axis title
# announces "(%)", so the tick labels are multiplied by 100 to match the title.
plot_user_chunk <- function(data, first_code, last_code, title) {
  data %>%
    mutate(numeric_user_id = as.integer(user_id)) %>%
    filter(numeric_user_id >= first_code, numeric_user_id <= last_code) %>%
    ggplot(aes(x = x, y = y, col = user_id)) +
    geom_point(alpha = 0.5) +
    theme_few() +
    theme(legend.position = "none") +
    xlab("Number of Tweets Collected") +
    ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
    scale_x_continuous(n.breaks = 5) +
    scale_y_continuous(labels = scales::label_number(scale = 100)) +
    facet_wrap(~user_id, nrow = 10, ncol = 6, scales = "free_x") +
    ggtitle(title)
}

# from 1~30
plot_user_chunk(df2, 1, 30, "First 30 users")

# from 31 onwards (31~60 with the 60 users in df2)
plot_user_chunk(df2, 31, Inf, "Second 30 users")
It seems some people follow very few accounts. Let’s check the distribution of the friends count.
# Distribution of the friends count: two histograms and two boxplots, on the
# raw and on a log scale, arranged in a 2 x 2 grid.

# One row per user with that user's friends count; shared by all four plots.
friends_per_user <- df2 %>%
  distinct(user_id, max_friends_count)

# d1: histogram on the raw scale.
d1 <- friends_per_user %>%
  ggplot(aes(max_friends_count)) +
  geom_histogram(bins = 80) +
  theme_few() +
  xlab("Friends Count (# of friends)") +
  ylab("Frequency (# of users)") +
  ggtitle("Histogram of Friends Count")

# d2: same histogram on a log10 x-axis. The argument is spelled `labels`
# in full rather than relying on partial matching of `label`.
d2 <- friends_per_user %>%
  ggplot(aes(max_friends_count)) +
  geom_histogram(bins = 80) +
  theme_few() +
  xlab("(Log) Friends Count (# of friends)") +
  ylab("Frequency (# of users)") +
  ggtitle("Histogram of (Log) Friends Count") +
  scale_x_log10(n.breaks = 10, labels = scales::label_number(accuracy = 1))

# d3: boxplot on the raw scale; scale_y_discrete() leaves the y-axis empty.
# NOTE(review): xlim() sets scale limits, so users outside 0-5000 are dropped
# before the box statistics are computed. If the intent is only to zoom,
# coord_cartesian(xlim = c(0, 5000)) would keep them in the statistics.
d3 <- friends_per_user %>%
  ggplot(aes(x = max_friends_count)) +
  geom_boxplot(outlier.color = 'red', outlier.shape = 8) +
  scale_y_discrete() +
  theme_few() +
  xlim(c(0, 5000)) +
  labs(
    title = "Boxplot of Friends Count",
    x = "Friends Count",
    y = ""
  )

# d4: boxplot of the natural log of the friends count. This is base e,
# unlike the log10 axis used for the histogram above.
d4 <- friends_per_user %>%
  mutate(log_friends_count = log(max_friends_count)) %>%
  ggplot(aes(x = log_friends_count)) +
  geom_boxplot(outlier.color = 'red', outlier.shape = 8) +
  scale_y_discrete() +
  xlim(c(0, 10)) +
  theme_few() +
  labs(
    title = "Boxplot of (Log) Friends Count",
    x = "(Log) Friends Count",
    y = " "
  )

grid.arrange(d1, d2, d3, d4, ncol = 2)
# Interactive table: one row per user, sorted by friends count, smallest first.
table_dta <- df2 %>%
  distinct(user_id, max_friends_count) %>%
  arrange(max_friends_count)

datatable(
  table_dta,
  caption = "Print User-Friends Count by Ascending Order",
  filter = "top"
)
With these users with very few friends in mind, let’s draw aggregated plots.
# aggregate plot: mean of y across users at each value of x, with a smoother.
# y is a proportion while the axis title announces "(%)", so the tick labels
# are multiplied by 100 to match the title. summarise() already returns an
# ungrouped result here, so no ungroup() is needed.
ag1 <- df2 %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Mean of distinct accounts / # of friends (%)") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(labels = scales::label_number(scale = 100))
ag1
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# aggregate plot: mean of y across users at each value of x, drawn on a
# log10 x-axis.
# y is a proportion while the axis title announces "(%)", so the tick labels
# are multiplied by 100 to match the title. `labels` is spelled in full
# rather than relying on partial matching of `label`.
ag2 <- df2 %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("(Log) Number of Tweets Collected") +
  ylab("Mean of distinct accounts / # of friends (%)") +
  scale_x_log10(n.breaks = 10, labels = scales::label_number(accuracy = 1)) +
  scale_y_continuous(labels = scales::label_number(scale = 100))
ag2
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
What happens to Plot 6 and Plot 7 if I remove the ones with very few friends (as seen in Plot 5)?
# Aggregate plot (mean of y at each x) without the users who have very few
# friends. Users are excluded by the rule itself (friends count at or below
# the cutoff) rather than by hard-coded user-ID strings, so the filter keeps
# working if the data or the ID formatting changes.
few_friends_cutoff <- 5

ag3 <- df2 %>%
  # remove users with 5 friends or fewer
  filter(max_friends_count > few_friends_cutoff) %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Mean of distinct accounts / # of friends (%)") +
  scale_x_continuous(n.breaks = 10) +
  # y is a proportion; scale the tick labels by 100 to match the "(%)" title
  scale_y_continuous(labels = scales::label_number(scale = 100)) +
  # reference lines at 5,000 and 15,000 tweets
  geom_vline(xintercept = 5000, linetype = 2, color = "red", alpha = 0.5) +
  geom_vline(xintercept = 15000, linetype = 3, color = "blue", alpha = 0.5)
ag3
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Aggregate plot (mean of y at each x) on a log10 x-axis, without the users
# who have very few friends. Users are excluded by the rule itself (5 friends
# or fewer) rather than by hard-coded user-ID strings, so the filter keeps
# working if the data or the ID formatting changes.
few_friends_cutoff <- 5

ag4 <- df2 %>%
  filter(max_friends_count > few_friends_cutoff) %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("(Log) Number of Tweets Collected") +
  ylab("Mean of distinct accounts / # of friends (%)") +
  # `labels` spelled in full rather than relying on partial matching
  scale_x_log10(n.breaks = 10, labels = scales::label_number(accuracy = 1)) +
  # y is a proportion; scale the tick labels by 100 to match the "(%)" title
  scale_y_continuous(labels = scales::label_number(scale = 100))
ag4
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Let’s bind plots 6~9 together to facilitate comparison:
# Lay the four aggregate plots out in two columns for side-by-side comparison.
grid.arrange(
  ag1, ag2,
  ag3, ag4,
  ncol = 2
)